NB: The worksheet has beed developed and prepared by Maxim Romanov for the course “R for Historical Research” (U Vienna, Spring 2020).
The following are the libraries that we will need for this section. Install those that you do not have yet.
#install.packages("ggplot2", "LDAvis", "readr", "slam", "stringr", "tictoc", "tidytext", "tidyverse", "tm", "topicmodels")
# general
library(ggplot2)
# text analysis specific
library(readr)
library(slam)
library(stringr)
library(tidytext)
library(tidyverse)
library(tm)
library(topicmodels)
library(text2vec)
library(stylo)
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
# extra
library(tictoc) # to time operationsd1864 <- read.delim("./data/dispatch_1864_filtered.tsv", encoding="UTF-8", header=TRUE, quote="", stringsAsFactors = FALSE)
d1864$date <- as.Date(d1864$date, format="%Y-%m-%d")
head(d1864)Let’s remove low freq items:
d1864_lowFreq <- d1864 %>%
unnest_tokens(word, text) %>%
count(word, sort=TRUE)
summary(d1864_lowFreq)## word n
## Length:58776 Min. : 1.00
## Class :character 1st Qu.: 1.00
## Mode :character Median : 2.00
## Mean : 61.63
## 3rd Qu.: 8.00
## Max. :278040.00
## word n
## Length:28107 Min. :1
## Class :character 1st Qu.:1
## Mode :character Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
Most of these low-frequency items are typos:
We can anti-join our corpus with lowFreq, which will remove them:
d1864_clean <- d1864 %>%
filter(type != "ad_blank") %>%
filter(type != "ad-blank") #%>% filter(type != "advert")
d1864_clean <- d1864_clean %>%
unnest_tokens(word, text) %>%
anti_join(lowFreq, by="word") %>%
group_by(id) %>%
count(word, sort=TRUE)
# unfiltered: 2,815,144
# filtered (>3): 2,749,078Additionally, we need to remove stop words, but first we need to identify them.
d1864_clean_FL <- d1864_clean %>%
group_by(word) %>%
summarize(freq=sum(n)) %>%
arrange(desc(freq))
d1864_clean_FLTo make things faster, you can remove top 50, 100, 150, 200 most frequent words, but this is a rather brutal way. Ideally, you want to curate your own stop word list that will be tuned to your texts. Below, I have taken top 500 words and manually removed everything that was meaningful (or, better, what I considered meaningful). Additionally, there are also NLP procedures that are designed to lemmatize words (i.e., reduce all words to their dictionary forms) and also do part-of-speech tagging, which allows to remove words categorically (for example, keeping only nouns, adjectives and verbs).
word <- c("the", "of", "and", "to", "in", "a", "that", "for", "on", "was", "is", "at", "be", "by", "from", "his", "he", "it", "with", "as", "this", "will", "which", "have", "or", "are", "they", "their", "not", "were", "been", "has", "our", "we", "all", "but", "one", "had", "who", "an", "no", "i", "them", "about", "him", "two", "upon", "may", "there", "any", "some", "so", "men", "when", "if", "day", "her", "under", "would", "c", "such", "made", "up", "last", "j", "time", "years", "other", "into", "said", "new", "very", "five", "after", "out", "these", "shall", "my", "w", "more", "its", "now", "before", "three", "m", "than", "h", "o'clock", "old", "being", "left", "can", "s", "man", "only", "same", "act", "first", "between", "above", "she", "you", "place", "following", "do", "per", "every", "most", "near", "us", "good", "should", "having", "great", "also", "over", "r", "could", "twenty", "people", "those", "e", "without", "four", "received", "p", "then", "what", "well", "where", "must", "says", "g", "large", "against", "back", "000", "through", "b", "off", "few", "me", "sent", "while", "make", "number", "many", "much", "give", "1", "six", "down", "several", "high", "since", "little", "during", "away", "until", "each", "5", "year", "present", "own", "t", "here", "d", "found", "reported", "2", "right", "given", "age", "your", "way", "side", "did", "part", "long", "next", "fifty", "another", "1st", "whole", "10", "still", "among", "3", "within", "get", "named", "f", "l", "himself", "ten", "both", "nothing", "again", "n", "thirty", "eight", "took", "never", "came", "called", "small", "passed", "just", "brought", "4", "further", "yet", "half", "far", "held", "soon", "main", "8", "second", "however", "say", "heavy", "thus", "hereby", "even", "ran", "come", "whom", "like", "cannot", "head", "ever", "themselves", "put", "12", "cause", "known", "7", "go", "6", "once", "therefore", "thursday", "full", "apply", "see", "though", "seven", "tuesday", "11", "done", "whose", "let", "how", "making", "immediately", "forty", "early", "wednesday", "either", "too", "amount", "fact", "heard", "receive", "short", "less", "100", "know", "might", "except", "supposed", "others", "doubt", "set", "works")
sWordsDF <- data.frame(word)
d1864_clean_minusSW <- d1864_clean %>%
anti_join(sWordsDF, by="word")## [1] 1759828 3
## [1] 1159214 3
From Wikipedia: In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.[1] It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. tf–idf is one of the most popular term-weighting schemes today; 83% of text-based recommender systems in digital libraries use tf–idf. Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document’s relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields, including text summarization and classification. One of the simplest ranking functions is computed by summing the tf–idf for each query term; many more sophisticated ranking functions are variants of this simple model.
df_TF_IDF <- d1864_clean_minusSW %>% # d1864_clean, d1864_clean_minusSW
bind_tf_idf(word, id, n) %>%
arrange(desc(tf_idf)) %>%
ungroup
df_TF_IDF## [1] "Suicides in France -- More than ten suicides take place every day in France; last year 4, 000 persons committed suicide."
Clustering is a method to break items into closely related groups—clusters. The code below show how our data can be broken into clusters with hierarchical clustering, using distance matrices. Hierarchical clustering is has rather high precision, but sensitive to outliers amd computationally expensive, which makes it nearly unusable with large datasets. K-means clustering is more suitable for large datasets, but struggles with uniform data (for example). In both cases you have to define the number of clusters.
In what follows, we take out TFIDF data, sample it, and run cluster analysis on our small sample. The chunk below simply prepares our data for analysis:
# RANDOMLY SELECT N ITEMS
set.seed(48965) # 4, 12, 48965
N = 100
sampledIDs <- sample(unique(df_TF_IDF$id), N)
sample_d1864_tfidf <- df_TF_IDF %>% filter(id %in% sampledIDs) %>%
select(id, word, tf_idf)
# CONVERT INTO DTM MATRIX
colnames(sample_d1864_tfidf) <- c("document", "term", "count")
sample <- tibble(sample_d1864_tfidf) %>% cast_dtm(document, term, count)
sample_matrix <- as.matrix(sample)
# CONVERT INTO REGULAR MATRIC AND CALCULATE DISTANCES
distanceMatrix <- dist.cosine(sample_matrix) # from library(stylo)
distanceMatrixHC <- as.dist(distanceMatrix)Now we can do the actual clustering. There are several clustering/linkage methods that can be used for clustering, and it usually depends on your goals. From ?hclust: “A number of different clustering methods are provided. Ward’s minimum variance method aims at finding compact, spherical clusters. The complete linkage method finds similar clusters. The single linkage method (which is closely related to the minimal spanning tree) adopts a ‘friends of friends’ clustering strategy. The other methods can be regarded as aiming for clusters with characteristics somewhere between the single and complete link methods. Note however, that methods”median" and “centroid” are not leading to a monotone distance measure, or equivalently the resulting dendrograms can have so called inversions or reversals which are hard to interpret, but note the trichotomies in Legendre and Legendre (2012)."
As a rule of thumb: you want balanced trees when you want an even number of items assigned to each cluster. Unbalanced trees are useful for finding outliers — with this method you can find which items you might want to remove in order to achieve better clustering.
YOu can find additional explanations here: https://towardsdatascience.com/https-towardsdatascience-com-hierarchical-clustering-6f3c98c9d0ca.
# THE FOLLOWING IS THE ACTUAL CLUSTERING
clustered.data.ward <- hclust(distanceMatrixHC, method = "ward.D")
clustered.data.complete <- hclust(distanceMatrixHC, method = "complete")
clustered.data.average <- hclust(distanceMatrixHC, method = "average")
clustered.data.single <- hclust(distanceMatrixHC, method = "single")
str(clustered.data.ward)## List of 7
## $ merge : int [1:99, 1:2] -94 -13 -97 -33 -46 -93 -19 -12 -18 -34 ...
## $ height : num [1:99] 0.601 0.606 0.608 0.703 0.712 ...
## $ order : int [1:100] 19 53 55 56 64 65 15 52 77 48 ...
## $ labels : chr [1:100] "1864-02-08_advert_86" "1864-05-09_advert_66" "1864-04-05_death_70" "1864-03-25_article_98" ...
## $ method : chr "ward.D"
## $ call : language hclust(d = distanceMatrixHC, method = "ward.D")
## $ dist.method: NULL
## - attr(*, "class")= chr "hclust"
plot(clustered.data.ward, labels=FALSE, main="Ward")
abline(h=2, col="blue", lty=3)
rect.hclust(clustered.data.ward, k=4, border="red")You can use rect.hclust() either with h= — which will cut tree at a given height, thus determining the number of clusters; or, with k= — which will cut tree into a given number of clusters.
hclust() creates an object from which you can extract further information. For example, we can use cutree() function to extract clustering information. You can use cutree() either with h= — which will cut tree at a given height, thus determining the number of clusters; or, with k= — which will cut tree into a given number of clusters.
clusters_named_vector <- cutree(clustered.data.ward, h=0.3)
clusters_df <- stack(clusters_named_vector)
colnames(clusters_df) <- c("cluster", "id")
clusters_df <- clusters_df %>% select(id, cluster)
clusters_dfIn a nutshell, we repeatedly run clustering, increasing the number of clusters by one, and calculate the total within-cluster sum of square (wss). We then plot the curve of wss and look for a point in the curve with the sharpest bend (hence the “elbow”), which is considered to be an indicator of the appropriate number of clusters. library(factoextra) can perform this with a single command:
Ideally, we sould have something like L or V. Here results do not seem to be very helpful (perhaps, 3 is our optimal number). We can try another method — average silhouette method (which is also easily callable from library(factoextra)). Like with elbow method, we run clustering multiple times but here we measures the quality of a clustering, bu determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering.
set.seed(786)
fviz_nbclust(as.matrix(distanceMatrixHC), FUN = hcut, k.max = 10, method = "silhouette")More on hierarchical clustering: “Hierarchical Cluster Analysis”, in in U of Cincinnati Business Analytics R Programming Course http://uc-r.github.io/hc_clustering.
Let’s get a different sample from our data. With k-means clustering we can run on more data:
# RANDOMLY SELECT N ITEMS
set.seed(786)
N = 1000
sampledIDs <- sample(unique(df_TF_IDF$id), N)
sample_d1864_tfidf <- df_TF_IDF %>% filter(id %in% sampledIDs) %>%
select(id, word, tf_idf)
# CONVERT INTO DTM MATRIX
colnames(sample_d1864_tfidf) <- c("document", "term", "count")
sample <- tibble(sample_d1864_tfidf) %>% cast_dtm(document, term, count)
sample_matrix <- as.matrix(sample)
# CONVERT INTO REGULAR MATRIC AND CALCULATE DISTANCES
distanceMatrix <- dist.cosine(sample_matrix) # from library(stylo)
distanceMatrixKM <- as.dist(distanceMatrix)## List of 9
## $ cluster : Named int [1:1000] 5 5 5 5 5 5 5 5 5 5 ...
## ..- attr(*, "names")= chr [1:1000] "1864-06-02_article_47" "1864-10-10_printrun_6" "1864-05-09_advert_42" "1864-02-08_advert_120" ...
## $ centers : num [1:5, 1:1000] 1 1 0.999 0.991 0.996 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : chr [1:1000] "1864-06-02_article_47" "1864-10-10_printrun_6" "1864-05-09_advert_42" "1864-02-08_advert_120" ...
## $ totss : num 1922
## $ withinss : num [1:5] 134 1.67 173.32 216.62 986.09
## $ tot.withinss: num 1512
## $ betweenss : num 410
## $ size : int [1:5] 49 11 118 144 678
## $ iter : int 5
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
We can extract cluster information in the same manner as with hclust() object:
kmeans_clusters_df <- stack(kmeansClusters$cluster)
colnames(kmeans_clusters_df) <- c("cluster", "id")
kmeans_clusters_df <- kmeans_clusters_df %>% select(id, cluster)
head(kmeans_clusters_df)We can vizualize the results of our kmeans clustering using fviz_cluster() (from factoextra). Our data is multidimensional — each word in our matrix is a single dimension, so this function will perform principal component analysis (PCA) and plot data according to the first two principal components that explain majority of the variance in our dataset.
In a nutshell, we repeatedly run clustering, increasing the number of clusters by one, and calculate the total within-cluster sum of square (wss). We then plot the curve of wss and look for a point in the curve with the sharpest bend (hence the “elbow”), which is considered to be an indicator of the appropriate number of clusters. library(factoextra) can perform this with a single command:
Ideally, we sould have something like L or V. Here results do not seem to be very helpful (perhaps, 3 is our optimal number). We can try another method — average silhouette method (which is also easily callable from library(factoextra)). Like with elbow method, we run clustering multiple times but here we measures the quality of a clustering, bu determining how well each object lies within its cluster. A high average silhouette width indicates a good clustering.
Let’s try to visialize our clusters again:
set.seed(786)
kmeansClusters <- kmeans(distanceMatrixKM, centers=3, nstart=25)
fviz_cluster(kmeansClusters, data = distanceMatrix, labelsize = 0)Here is, however, an example of where k-means clustering may/can fail: we gave a different starting point to the algorithm and
set.seed(786)
kmeansClusters <- kmeans(distanceMatrixKM, centers=3, nstart=50)
fviz_cluster(kmeansClusters, data = distanceMatrix, labelsize = 0)More information, see:
Although not cluster analysis techniques strictly speaking, PCA (principal component analysis) and MDS (multi-dimensional scaling) are used in similar ways. We will touch upon these in the context of stylometry.
We can start with our preprocessed variable d1864_clean, which is essentially a cumulative frequency list for all articles.
## <<DocumentTermMatrix (documents: 14246, terms: 30393)>>
## Non-/sparse entries: 1159214/431819464
## Sparsity : 100%
## Maximal term length: 32
## Weighting : term frequency (tf)
Training a model. NB: eval=FALSE setting will prevent from running this chunk when you Knit the notebook; but you can still execute it within the notebook, when you run chunks individually
tic()
d1864_lda <- LDA(d1864_dm, k = 4, control = list(seed = 1234))
d1864_lda
toc()
#A LDA_VEM topic model with 2 topics.
#35.962 sec elapsed
#A LDA_VEM topic model with 4 topics.
#72.545 sec elapsedDo not run this!
tic()
kVal <- 25
d1864_lda_better <- LDA(d1864_dm, k=kVal, control=list(seed=1234))
toc()
#A LDA_VEM topic model with 20 topics.
#1261.087 sec elapsed (21 minutes)
#A LDA_VEM topic model with 25 topics.
#1112.262 sec elapsed (18 minutes)Save/load the model, so that there is no need to regenerate it every time.
#d1864_lda_vem_25t_better <- d1864_lda_better
#save(d1864_lda_vem_25t_better, file="d1864_lda_vem_25t_better.rda")
#load(file="d1864_lda_vem_25t_better.rda")load(file="./data/d1864_lda_vem_25t_better.rda")
lda_model <- d1864_lda_vem_25t_better
corpus <- d1864From this point on, the code should simply run — if you rename your own model produced above to lda_model.
lda_model_better_topic_term_prob <- tidy(lda_model, matrix="beta")
lda_model_better_topic_term_prob %>%
filter(term == "bonds") %>%
arrange(desc(beta))NB: This step may throw an error. The error seems a bit cryptic, but restarting R (without saving workspace), seems to help. (beta stands for term-per-topic probability).
top_terms <- lda_model_better_topic_term_prob %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
arrange(topic, -beta)
head(top_terms)library(ggplot2)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend=FALSE) +
facet_wrap(~topic, scales = "free") +
coord_flip()Topic-per-document probabilities: this object will tell us to which topics documents belong (and to what extent): (gamma stands for topic-per-document probability).
Top N documents per topic: this will create an ojbect with top N documents per each topic.
N = 10
top_docs <- lda_model_topic_doc_prob %>%
group_by(topic) %>%
top_n(N, gamma) %>%
ungroup() %>%
arrange(topic, -gamma)
top_docsNow that we have IDs of representative documents, we can check those documents one by one, but le’ts do somthing else first: topic-title function—it is not really necessary, but it will combine together topic number (from the model) and its top words, which can be used for graphs.
topicVar <- function(top, topTerms){
topicVar <- topTerms %>%
filter(topic == top) %>%
arrange(-beta)
vals = paste(topicVar$term, collapse=", ")
as.String(paste(c("Topic ", top, ": ", vals), collapse=""))
}## Topic 8: treasury, bonds, notes, 1864, states, cent, certificates, confederate, secretary, department, america, richmond, payment, interest, treasurer
## Treasury Department, Confederate States of America, Richmond, August 8, 1864. Certificates of Indebtedness Bearing Six Per Cent. Per Annum interest and Free from Taxation. -- By the fourteenth section of the act to reduce the currency, approved February 17, 1864, the Secretary of the Treasury is authorized to issue the above certificates, payable two years after the ratification of a treaty of peace with the United States. They cannot be sold, but are only to be issued to such creditors of the Government as are willing to receive the same in payment of their demands. They must also be given at par, though free from taxation. The attention of purchasing agents and disbursing officers of the Government is called to this class of public securities as offering peculiar advantages to those from whom the supplies of the Government are bought; and to facilitate the use of them, checks drawn by disbursing officers upon the depositaries holding these funds, and marked across the face " payable in certificates of indebtedness," will be paid in conformity therewith. Depositaries are hereby authorized and required to comply with this regulation, and to make application to the Register for supplies of certificates as required. Signed) G. A. Trenholm, Secretary of Treasury. au 22 -- ts
corpus_light <- corpus %>%
select(-header, -text)
lda_model_topics <- lda_model_topic_doc_prob %>%
rename(id=document) %>%
left_join(corpus_light, by="id") %>%
group_by(topic, date) %>%
summarize(freq=sum(gamma))
lda_model_topicsNow, we can plot topic distribution over time:
topicVal = 8
lda_model_topics_final <- lda_model_topics %>%
filter(topic == topicVal)
plot(x=lda_model_topics_final$date, y=lda_model_topics_final$freq, type="l", lty=3, lwd=1,
main=topicVar(topicVal, top_terms),
xlab = "1864 - Dispatch coverage", ylab = "topic saliency")
segments(x0=lda_model_topics_final$date, x1=lda_model_topics_final$date, y0=0, y1=lda_model_topics_final$freq, lty=1, lwd=2)LDAvis offers a visual browser for topics, which has already became a very popular tool for this purpose. If everything is done right, a visualization similar to the one below should open in a browser.
LDAvis Browser Example.
We can use the following function that extracts all needed information from a model and converts it into a format that LDAvis expects:
library(LDAvis)
library(slam)
topicmodels2LDAvis <- function(x, ...){
post <- topicmodels::posterior(x)
if (ncol(post[["topics"]]) < 3) stop("The model must contain > 2 topics")
mat <- x@wordassignments
LDAvis::createJSON(
phi = post[["terms"]],
theta = post[["topics"]],
vocab = colnames(post[["terms"]]),
doc.length = slam::row_sums(mat, na.rm = TRUE),
term.frequency = slam::col_sums(mat, na.rm = TRUE)
)
}NB: there are some issues with LDAvis and for some reason it does not always parse out a topic model object. We can try loading another one, which does work: this is a 20 topic model based on issues of the Dispatch covering the period of 1861-1864.
This is simply a chunk of code that you can reuse for generating different distances. This code will not run because there is no variable YOUR-MATRIX!
#library(stylo)
# USING library(stylo) FUNCTIONS
if (distanceMethod == "cosine"){distanceMatrix <- dist.cosine(YOUR_MATRIX)
} else if (distanceMethod == "delta"){distanceMatrix <- dist.delta(YOUR_MATRIX)
} else if (distanceMethod == "argamon"){distanceMatrix <- dist.argamon(YOUR_MATRIX)
} else if (distanceMethod == "eder"){distanceMatrix <- dist.eder(YOUR_MATRIX)
} else if (distanceMethod == "minmax"){distanceMatrix <- dist.minmax(YOUR_MATRIX)
} else if (distanceMethod == "enthropy"){distanceMatrix <- dist.enthropy(YOUR_MATRIX)
} else if (distanceMethod == "simple"){distanceMatrix <- dist.simple(YOUR_MATRIX)
} else if (distanceMethod == "wurzburg"){distanceMatrix <- dist.wurzburg(YOUR_MATRIX)
# USING dist() FUNCTION
} else if (distanceMethod == "euclidean"){distanceMatrix <- dist(YOUR_MATRIX, method="euclidean")
} else if (distanceMethod == "maximum"){distanceMatrix <- dist(YOUR_MATRIX, method="maximum")
} else if (distanceMethod == "manhattan"){distanceMatrix <- dist(YOUR_MATRIX, method="manhattan")
} else if (distanceMethod == "canberra"){distanceMatrix <- dist(YOUR_MATRIX, method="canberra")
} else if (distanceMethod == "binary"){distanceMatrix <- dist(YOUR_MATRIX, method="binary")
} else if (distanceMethod == "minkowski"){distanceMatrix <- dist(YOUR_MATRIX, method="minkowski")
distanceMatrix <- as.dist(distanceMatrix)
clustered.data <- hclust(distanceMatrix, method = clusteringMethod)